* Create column-wise table of summary statistics and ttest for diff in group means
*! Michael Kevane October 2014

	version 12
	set more off
	
* Define the locals for the variables from the lists that are passed through
	local variab "`1'"	/*1 are the categorical variables*/
	local p "`2'"		/*2 p are the 0-1 dummy outcome variables*/
	local c "`3'" 		/*3 c are the continuous outcome variables*/
						/*4 is the filename*/	

* Use summarize to get the mean and sd for each variable by the group variables
	foreach var in `variab' {
	eststo clear
	local replace replace
	foreach tt in `c' `p' `variab' {
	* if all the observations in a category are missing, then estpost gets messed up
	* because nothing outputted for that category, and so then next variable gets put
	* in its place in the table (everthing shifts over for that category
	* so workaround is to put in 0 for that category
	* which is what next two lines do
	preserve
	bysort `var': egen xxpp=total(`tt') , missing
	replace `tt'=0 if xxpp==.
	bysort `var': eststo: quietly estpost summarize `tt' , listwise 
	quietly esttab using "est`var'.csv", cells("mean sd count")  nodepvar `replace' plain   
	eststo clear
	restore
	local replace append
	}
	* find the number of categories, the last category should be defined to be biggest with most obs
	* nvals eqals 1 when is last obs of category
	by `var', sort: gen nvals = _n == 1 
	* now nvals = running sum nvals
	replace nvals = sum(nvals)	
	* now nvals is the final sum at last observation _N = number ofcategories
	replace nvals = nvals[_N] 
	local catnum=`=nvals'
	local catnumz=`catnum'*3+1
	local catnumx=`catnumz'-2
	local catnumy=`catnumz'-1
	local catnumw=`catnumz'-3
	di "`catnum'"
	di "`catnumx'"
	di "`catnumy'"
	di "`catnumz'"
	drop nvals

* Manipulate the summarize outcome to implement ttest or prop-test and create asterisks
	quietly {
	preserve
	insheet using "est`var'.csv", clear
	drop if v1=="" | v1=="N"
	* create a new row for the sample size
	local new = _N + 1
    	set obs `new'
	replace v1="Sample size" if _n==_N
	destring, replace

	* Generate variables to distinguish proportions from continuous variables
	gen prop=.
	foreach z in `p' `variab' {
		 replace prop=1 if v1=="`z'"
		 }
	gen cont=.
	foreach z in `c' {
		 replace cont=1 if v1=="`z'"
		 }
	loc proportions "if prop==1"
	loc continuous "if cont==1"
	}
	
	* for each by-group there are three observations - mean-sd-count
	* v14-v15-v16 is the "baseline" group to compare with others if there are 5 categories
	* v11-v12-v13 is the "baseline" group to compare with others if there are 4 categories
	* v8-v9-v10 is the "baseline" group to compare with others if there are 3 categories
	* v5-v6-v7 is the "baseline" group to compare with others if there are 2 categories
	* v2-v3-v4 is the first group
	* catnumw is one less than 5,8,11 or 14 so if there are not more categories it does ttest for next highest category
	forval i=2(3)`catnumw'  {
	local j=`i'+1
	local k=`i'+2
	* Find the t-stat for means and the pval for that tstat (distributed student t)
	gen va`i'=(v`i'-v`catnumx')/(((v`j'^2/v`k')+(v`catnumy'^2/v`catnumz'))^(1/2)) `continuous'
	gen df`i'= round((((v`j'^2/v`k')+(v`catnumy'^2/v`catnumz'))^2)/ (((v`j'^2/v`k')^2/(v`k'-1)+(v`catnumy'^2/v`catnumz')^2/(v`catnumz'-1))))
	gen vb`i'= 2*ttail(df`i',abs(va`i')) `continuous'
	* so now have va2, vb2 where va2 is the t-stat and vb2 is the pval
	* Find the z-stat for proportions and the pval for that zstat (distributed normally)
	gen poolsamva`i'=(v`i'*v`k' + v`catnumx'*v`catnumz')/(v`k'+v`catnumz')
	replace va`i'=(v`i'-v`catnumx')/((poolsamva`i'*(1-poolsamva`i'))*((1/v`k')+(1/v`catnumz'))^(1/2)) `proportions'
	replace va`i'=(v`i'-v`catnumx')  /  (   (poolsamva`i'*(1-poolsamva`i')) * ((1/v`k')+(1/v`catnumz'))    )^(1/2) `proportions'
	replace vb`i'=2*normal(-abs(va`i')) `proportions'
	* put the tstat and pval right after the sd
	order va`i', after(v`i')
	order vb`i', after(va`i')
	gen vd`i'=""
	replace vd`i'="*" if vb`i'<=.10
	replace vd`i'="**" if vb`i'<=.05
	replace vd`i'="***" if vb`i'<=.01
	order vd`i', after(v`j')  /*changed here*/
	*round the standard deviations
	replace v`j'=round(v`j',.01)
	replace v`catnumy'=round(v`catnumy',.01)
	egen vc`k'=max(v`k')
	replace v`i' = vc`k' if v`i'==.
	drop poolsamva`i' df`i' vc`k'
	drop  v`k' va`i' vb`i' /*v`j' */
	egen vc`catnumz'=max(v`catnumz')
	replace v`catnumx'=vc`catnumz' if v`catnumx'==.
	drop vc`catnumz' 
	}
	drop  v`catnumz' /*v`catnumy'*/
	
	label var v1 " "
	forval i=2(3)`catnumw'  {
	label var vd`i' " "
	}
	local gg=1
	forval i=2(3)`catnumx'  {
	loc j=`i'+1
	replace v`i'=round(v`i',.01)
	rename v`i' v`i'`var'
	rename v`j' v`j'_sd_`var'
	label var v`i'`var' "mean `var'=`gg'"
	label var v`j' "st dev"
	local gg=`gg'+1
	}
	forval i=2(3)`catnumw'  {
	rename vd`i' vd`i'`var'
	}
	drop prop cont
	save e_`var', replace
	*erase "est`var'.csv"
	restore	
	}
	
	
* now merge the datasets which are in same variable order
	quietly {
	local i=1
	foreach var in `variab' {
	if "`i'"=="1" {
		use e_`var', clear
		}
	else if "`i'"~="1" {
		merge 1:1 _n using e_`var'
		drop _merge
		}
	local i=`i'+1
	}
/*	foreach z in `variab'  {
	forval t=2(3)`catnumx'  {
	replace v`t'`z'=round(v`t'`z',.01)
	}
	}
*/
	}
	local new = _N + 5
    set obs `new'
	replace v1="Notes: * p<0.10 ** p<0.05 *** p<0.01." if  _n==_N-4

	export excel using "`4'", sh(rawdat1) sheetreplace first(varl) 
	
